# Code
# import software libraries / dependencies
import nltk
import os
import re
import matplotlib.pyplot as plt
from collections import Counter
from nltk.stem import SnowballStemmer
# Download the tokenizer data that nltk.word_tokenize() needs.
# 'punkt' serves older NLTK releases; recent releases (3.9 and later) load the
# 'punkt_tab' resource instead and raise LookupError when it is missing, so
# both are requested. A download that is not needed is simply reported by NLTK.
nltk.download('punkt')
nltk.download('punkt_tab')

# Shared English Snowball stemmer, used when normalising tokens
stemmer = SnowballStemmer("english")
# Preprocess the files of one corpus directory and chart its re- words
def process_directory(corpus_directory):
    """Count and plot the stemmed "re-" words found in one corpus directory.

    Every regular file directly inside ``corpus_directory`` is read as UTF-8
    and tokenized with ``nltk.word_tokenize``. Tokens whose lowercase form
    starts with the literal prefix ``re-`` are stemmed with the module-level
    ``stemmer`` and counted, and the 20 most frequent stems are shown in a
    bar chart.

    Args:
        corpus_directory: Path to a directory of text files. The last five
            characters of its base name are dropped to build the name shown
            in the chart title (e.g. ``hardy_noBP`` -> ``Hardy``).

    Returns:
        None. The chart is displayed with ``plt.show()``. When no matching
        word is found a message is printed and no chart is drawn.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    # Use the directory name as the label. normpath drops a trailing
    # separator that would otherwise leave basename() with an empty string.
    label = os.path.basename(os.path.normpath(corpus_directory))

    # Compiled once instead of being looked up for every token.
    # NOTE(review): the hyphen is literal, so only hyphenated forms such as
    # "re-turn" are kept and "return" is not -- confirm this is intended.
    re_word = re.compile(r'\b(re-)\w+')

    word_counts = Counter()
    # Sorted so the order in which ties are counted does not depend on the
    # arbitrary order returned by os.listdir().
    for filename in sorted(os.listdir(corpus_directory)):
        path = os.path.join(corpus_directory, filename)
        if not os.path.isfile(path):
            # Sub-directories cannot be opened as text; skip them.
            continue
        with open(path, 'r', encoding='utf-8') as file:
            text = file.read()
        # Tokenize, lowercase, keep re- words, stem them and count the stems
        for token in nltk.word_tokenize(text):
            word = token.lower()
            if re_word.match(word):
                word_counts[stemmer.stem(word)] += 1

    # Optional: display the most frequent words as text
    # for word, count in word_counts.most_common(20):
    #     print(f'The poetry of {label[:-5]} uses {word}: {count} times')

    most_common = word_counts.most_common(20)  # Set the desired number of top words
    if not most_common:
        # zip(*[]) yields nothing to unpack, so stop before plotting.
        print(f'No words starting with "re-" found in {corpus_directory}; nothing to plot')
        return

    # Plot words and counts in a bar chart
    plt.figure(figsize=(10, 5))
    top_words, top_counts = zip(*most_common)
    plt.bar(top_words, top_counts)
    plt.title(f'Frequency of Stemmed Re- Words in {label[:-5].capitalize()}\'s Poetry')
    plt.xticks(rotation=65)
    plt.show()
# Corpus folders to analyse, one per poet. process_directory() drops the last
# five characters of each folder name when it builds the chart title.
corpus_directories = [
    '/home/adammazel/Documents/Digital_Scholarship/re-victorian-poetry/cta/swinburne/swinburne_noBP',
    '/home/adammazel/Documents/Digital_Scholarship/re-victorian-poetry/cta/hardy/hardy_noBP',
    '/home/adammazel/Documents/Digital_Scholarship/re-victorian-poetry/cta/field/field_NoBP',
    '/home/adammazel/Documents/Digital_Scholarship/re-victorian-poetry/cta/rossetti_dg/rossetti_dg_NoBP',
    # Add more directories here
]

# Chart the re- words of every listed corpus, in list order
for corpus_path in corpus_directories:
    process_directory(corpus_path)